# ISLR Chapter 4 applied exercise: classification on the Weekly stock-market data.
library(ISLR)
# NOTE(review): attach() is discouraged (it masks workspace objects and makes
# dependencies implicit); later code relies on it for bare Lag2/Direction names.
attach(Weekly)
# Numerical summary of all predictors and the Direction response.
summary(Weekly)
# 2x3 grid of histograms to eyeball the distributions of the lag returns and
# trading volume (all lags look roughly symmetric; Volume is right-skewed).
par(mfrow=c(2,3))
hist(Weekly$Lag1)
hist(Weekly$Lag2)
hist(Weekly$Lag3)
hist(Weekly$Lag4)
hist(Weekly$Lag5)
hist(Weekly$Volume)
# Logistic regression of weekly market direction on the five lagged weekly
# returns plus trading volume, fit on the full data set.
log1 <- glm(
  Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
  family = binomial,
  data = Weekly
)
# Coefficient table with z-statistics and p-values.
summary(log1)
# Lag2 appears to be the only statistically significant predictor (p < 0.05).
# Fitted probabilities of "Up" on the training data, then a confusion matrix
# at the conventional 0.5 threshold.
log1.pred <- predict(log1, type = "response")
# Vectorized label assignment (replaces the rep()/index-assign idiom and the
# non-idiomatic `=` assignment).
log1.response <- ifelse(log1.pred > 0.5, "Up", "Down")
table(log1.response, Weekly$Direction)
# Overall fraction of correct training-set predictions.
mean(log1.response == Weekly$Direction)
# Same accuracy read off the confusion-matrix diagonal: correct Down + correct
# Up over n = 1089 weeks, approximately 0.561.
(54 + 557) / 1089
# The training accuracy is (54 + 557) / 1089, approximately 56.1%, so the
# training error rate is 100% - 56.1% = 43.9%. The training error rate is often
# overly optimistic -- it tends to underestimate the test error rate. To better
# assess the accuracy of the logistic regression model, we can fit it using
# part of the data and then examine how well it predicts the held-out data.
# This yields a more realistic error rate, in the sense that in practice we
# care about the model's performance not on the data used to fit it, but on
# future days for which the market's movement is unknown.
# Hold-out validation: train on 1990-2008, test on 2009-2010, using only Lag2
# (the one predictor that was significant in the full model).
training <- Weekly$Year < 2009
Weekly.2009 <- Weekly[!training, ]
dim(Weekly.2009)
direction.2009 <- Weekly$Direction[!training]

glm.fit <- glm(Direction ~ Lag2, data = Weekly, family = binomial,
               subset = training)
# Predicted P(Up) for the held-out 2009-2010 weeks (`<-` replaces `=`).
glm.probs <- predict(glm.fit, Weekly.2009, type = "response")
glm.pred <- ifelse(glm.probs > 0.5, "Up", "Down")
# Test-set confusion matrix and overall test accuracy.
table(glm.pred, direction.2009)
mean(glm.pred == direction.2009)
library(MASS)

# Linear discriminant analysis on Lag2 with the same train/test split.
lda.fit <- lda(Direction ~ Lag2, data = Weekly, subset = training)
lda.pred <- predict(lda.fit, Weekly.2009)
table(lda.pred$class, direction.2009)
# Test accuracy for LDA (added: the original reported accuracy for QDA but
# not LDA, making the two methods hard to compare).
mean(lda.pred$class == direction.2009)

# Quadratic discriminant analysis on Lag2 (`<-` replaces `=` throughout).
qda.fit <- qda(Direction ~ Lag2, data = Weekly, subset = training)
qda.class <- predict(qda.fit, Weekly.2009)$class
table(qda.class, direction.2009)
mean(qda.class == direction.2009)
library(class)

# K-nearest neighbours (k = 1) on Lag2. Columns are referenced explicitly via
# Weekly$ rather than relying on attach(), and the class labels are kept as a
# factor instead of being coerced through as.matrix().
train.X <- as.matrix(Weekly$Lag2[training])
test.X <- as.matrix(Weekly$Lag2[!training])
train.Y <- Weekly$Direction[training]
# knn() breaks distance ties at random, so fix the seed for reproducibility.
set.seed(1)
knn.pred <- knn(train.X, test.X, train.Y, k = 1)
# Test-set confusion matrix and accuracy.
table(knn.pred, direction.2009)
mean(knn.pred == direction.2009)
# LDA provides the best test-set accuracy on this data.
# Exercise on the Auto data: create a binary response mpg01 that is 1 when a
# car's mpg is above the median and 0 otherwise. The vectorized comparison
# replaces the two-step rep()/index-assign idiom.
Auto$mpg01 <- as.numeric(Auto$mpg > median(Auto$mpg))
nrow(Auto)
# NOTE(review): corrplot is loaded but never called; corrplot(cor(Auto[, -9]))
# would visualize the matrix below.
library(corrplot)
# Correlations among the numeric columns; column 9 (`name`) is a factor and
# must be excluded. mpg01 (appended as the last column) is included.
cor(Auto[, -9])
# Scatterplot matrix of every pair of variables.
pairs(Auto)